In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import os
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_hub as hub
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
# Params for bert model and tokenization


In [None]:
class LoadingData():
    """Load the intent benchmark JSON files into shuffled pandas DataFrames.

    Every file found under the training directory defines one intent: the file
    name with ``.json`` removed is the intent id, and the files are numbered in
    sorted order to obtain integer category ids.

    Attributes:
        cat_to_intent: dict mapping category id (int) -> intent id (str).
        intent_to_cat: dict mapping intent id (str) -> category id (int).
        train_data_frame: shuffled DataFrame with columns
            ``query``, ``intent``, ``category`` built from the training files.
        validation_data_frame: same layout, built from the validation files.
    """

    def __init__(self, train_file_path=None, validation_file_path=None, random_state=None):
        """Read both splits and build the intent/category mappings.

        Args:
            train_file_path: directory holding the training JSON files.
                Defaults to the benchmark's ``Train`` folder under ``../input``.
            validation_file_path: directory holding the validation JSON files.
                Defaults to the benchmark's ``Validate`` folder under ``../input``.
            random_state: forwarded to ``DataFrame.sample`` when shuffling.
                ``None`` (the default) gives a different order on every run.

        Raises:
            KeyError: if a validation file names an intent that has no
                training file, or a file lacks its intent id as top-level key.
        """
        data_root = os.path.join("..","input","nlp-benchmarking-data-for-intent-and-entity","benchmarking_data")
        if train_file_path is None:
            train_file_path = os.path.join(data_root, "Train")
        if validation_file_path is None:
            validation_file_path = os.path.join(data_root, "Validate")

        train_files = list(self._iter_intent_files(train_file_path))
        validation_files = list(self._iter_intent_files(validation_file_path))

        # One category id per training file, assigned in sorted traversal order
        # so that the numbering is the same on every run and every filesystem.
        self.cat_to_intent = {}
        self.intent_to_cat = {}
        for category_id, (_, intent_id) in enumerate(train_files):
            self.cat_to_intent[category_id] = intent_id
            self.intent_to_cat[intent_id] = category_id
        print(self.cat_to_intent)
        print(self.intent_to_cat)

        self.train_data_frame = self._build_frame(train_files, random_state)
        self.validation_data_frame = self._build_frame(validation_files, random_state)

    @staticmethod
    def _iter_intent_files(root):
        """Yield ``(file_path, intent_id)`` for every file under ``root`` in sorted order."""
        for dirname, dirnames, filenames in os.walk(root):
            dirnames.sort()  # in-place sort makes os.walk descend in a fixed order
            for filename in sorted(filenames):
                yield os.path.join(dirname, filename), filename.replace(".json","")

    def _build_frame(self, intent_files, random_state):
        """Collect the examples of all given files into one shuffled DataFrame."""
        rows = []
        for file_path, intent_id in intent_files:
            rows += self.make_data_for_intent_from_json(file_path, intent_id, self.intent_to_cat[intent_id])
        frame = pd.DataFrame(rows, columns=['query', 'intent', 'category'])
        # frac=1 returns every row exactly once, i.e. a shuffle.
        return frame.sample(frac=1, random_state=random_state)

    def make_data_for_intent_from_json(self, json_file, intent_id, cat):
        """Turn one intent JSON file into ``(query, intent_id, cat)`` tuples.

        The file must contain a top-level key equal to ``intent_id`` whose value
        is a list of examples; each example has a ``data`` list of chunks and
        each chunk has a ``text`` string. The chunk texts of one example are
        joined with single spaces and runs of spaces are collapsed to one.

        Args:
            json_file: path of the JSON file to read.
            intent_id: intent name, also the top-level key inside the file.
            cat: integer category id stored alongside every query.

        Returns:
            list of ``(query, intent_id, cat)`` tuples, one per example.
        """
        # Context manager closes the file even if parsing fails.
        with open(json_file) as handle:
            json_d = json.load(handle)

        sent_list = []
        for example in json_d[intent_id]:
            sent = " ".join(chunk['text'] for chunk in example['data'])
            # Chunks often carry their own leading/trailing spaces; collapse
            # every run of spaces, however long, to a single space.
            while "  " in sent:
                sent = sent.replace("  ", " ")
            sent_list.append((sent, intent_id, cat))
        return sent_list
            

In [None]:
# Build the loader once: the constructor reads the JSON files, prints both
# intent/category mappings and stores shuffled train and validation frames.
load_data_obj = LoadingData()

In [None]:
# Preview the first rows of the shuffled training frame (columns: query, intent, category).
load_data_obj.train_data_frame.head()

In [None]:
# Bare expression: the notebook renders the training frame as this cell's output.
load_data_obj.train_data_frame

In [None]:
# First rows of the validation frame as a raw NumPy array instead of a rendered table.
load_data_obj.validation_data_frame.head().values

In [None]:
# Preview the first rows of the training frame again before the modelling sections.
load_data_obj.train_data_frame.head()

# LSTM

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level vocabulary learned from the training queries only; words outside
# the kept vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(load_data_obj.train_data_frame['query'])

# Replace every query by its sequence of integer word ids.
train_sequences, validation_sequences = (
    tokenizer.texts_to_sequences(frame['query'])
    for frame in (load_data_obj.train_data_frame, load_data_obj.validation_data_frame)
)

# Every sequence is post-padded to the length of the longest training query.
max_length = max(len(sequence) for sequence in train_sequences)
train_padded, validation_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, validation_sequences)
)


In [None]:
import pandas as pd
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding.
# The width is fixed to the number of known intents: without num_classes,
# to_categorical sizes each matrix from the largest id it happens to see, so the
# train and validation matrices could end up with different numbers of columns.
num_classes = len(load_data_obj.cat_to_intent)
train_labels = to_categorical(load_data_obj.train_data_frame['category'], num_classes=num_classes)
validation_labels = to_categorical(load_data_obj.validation_data_frame['category'], num_classes=num_classes)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Length of the longest tokenised training query; it fixes the model's input length.
max_length = max(len(sequence) for sequence in train_sequences)

# Embedding -> bidirectional LSTM -> dense hidden layer with dropout -> softmax
# over the intents (one output unit per one-hot label column).
model = Sequential([
    Embedding(input_dim=5000, output_dim=32, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(train_labels[0]), activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Train for 10 epochs, scoring the validation split after every epoch.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=10,
    validation_data=(validation_padded, validation_labels),
)


In [None]:
# Final loss and accuracy of the trained model on the validation split.
loss, accuracy = model.evaluate(x=validation_padded, y=validation_labels)
print(f'Validation loss: {loss}, Validation accuracy: {accuracy}')


In [None]:

# Start the results table with the LSTM score.
# The row is passed to the constructor directly: DataFrame.append was removed
# in pandas 2.0, and building from a record also gives the accuracy column a
# numeric dtype from the start.
results_df = pd.DataFrame(
    [{'model_name': 'LSTM', 'validation_accuracy': accuracy}],
    columns=['model_name', 'validation_accuracy'],
)

# Display the DataFrame
print(results_df)


# RANDOM FOREST


In [None]:
# Short aliases for the two splits.
train_data_frame = load_data_obj.train_data_frame
validation_data_frame = load_data_obj.validation_data_frame

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# TF-IDF features: the vocabulary (at most 5000 terms) and the weights are
# learned from the training queries and then applied to the validation queries.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_data_frame['query'])
validation_tfidf = tfidf_vectorizer.transform(validation_data_frame['query'])

# Integer targets: the encoder is fitted on the training categories and the
# same mapping is reused for the validation categories.
label_encoder = LabelEncoder()
label_encoder.fit(train_data_frame['category'])
train_labels_encoded = label_encoder.transform(train_data_frame['category'])
validation_labels_encoded = label_encoder.transform(validation_data_frame['category'])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 100-tree random forest; the fixed random_state makes the fitted forest repeatable.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the TF-IDF features and the integer-encoded intents.
rf_classifier.fit(X=train_tfidf, y=train_labels_encoded)


In [None]:
# Predicted intent ids for the validation queries.
validation_predictions = rf_classifier.predict(validation_tfidf)

# Fraction of validation queries whose predicted intent equals the true one.
validation_accuracy = accuracy_score(
    y_true=validation_labels_encoded,
    y_pred=validation_predictions,
)
print(f'Validation Accuracy of Random Forest: {validation_accuracy * 100:.2f}%')


In [None]:
# Add the Random Forest score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'Random Forest', 'validation_accuracy': validation_accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with a raised iteration cap; increase max_iter further
# if the solver reports that it did not converge.
log_reg_classifier = LogisticRegression(max_iter=1000)

# Fit on the same TF-IDF features and integer-encoded intents as the forest.
log_reg_classifier.fit(X=train_tfidf, y=train_labels_encoded)


In [None]:
# Predicted intent ids for the validation queries.
validation_predictions = log_reg_classifier.predict(validation_tfidf)

# Fraction of validation queries whose predicted intent equals the true one.
validation_accuracy = accuracy_score(
    y_true=validation_labels_encoded,
    y_pred=validation_predictions,
)
print(f'Validation Accuracy of Logistic Regression: {validation_accuracy * 100:.2f}%')


In [None]:
# Add the Logistic Regression score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'Logistic Regression', 'validation_accuracy': validation_accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# GRU

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Embedding followed by four stacked GRU layers, each with 20% dropout.
# The first three GRUs return the full sequence so the next GRU can consume it;
# the fourth returns only its final state, which feeds the softmax classifier
# (one output unit per one-hot label column).
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=max_length),
    GRU(units=50, return_sequences=True, activation='tanh'),
    Dropout(0.2),
    GRU(units=50, return_sequences=True, activation='tanh'),
    Dropout(0.2),
    GRU(units=50, return_sequences=True, activation='tanh'),
    Dropout(0.2),
    GRU(units=50, activation='tanh'),
    Dropout(0.2),
    Dense(units=len(train_labels[0]), activation='softmax'),
])

# Multi-class setup: categorical cross-entropy on one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# Train for 10 epochs, scoring the validation split after every epoch.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=10,
    validation_data=(validation_padded, validation_labels),
)


In [None]:
# Final loss and accuracy of the trained GRU model on the validation split.
loss, accuracy = model.evaluate(x=validation_padded, y=validation_labels)
print(f'Validation loss: {loss}, Validation accuracy: {accuracy}')


In [None]:
# Add the GRU score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'GRU', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# RNN 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Embedding followed by two stacked SimpleRNN layers, each with 20% dropout.
# The first RNN returns the full sequence for the second one; the second
# returns only its final state, which feeds the softmax classifier
# (one output unit per one-hot label column).
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=max_length),
    SimpleRNN(units=64, return_sequences=True),
    Dropout(0.2),
    SimpleRNN(units=64),
    Dropout(0.2),
    Dense(units=len(train_labels[0]), activation='softmax'),
])

# Multi-class setup: categorical cross-entropy on one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# Train for 10 epochs, scoring the validation split after every epoch.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=10,
    validation_data=(validation_padded, validation_labels),
)


In [None]:
# Final loss and accuracy of the trained RNN model on the validation split.
loss, accuracy = model.evaluate(x=validation_padded, y=validation_labels)
print(f'Validation loss: {loss}, Validation accuracy: {accuracy}')


In [None]:
# Add the RNN score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'RNN', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# BERT

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer


# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoder = TFBertModel.from_pretrained('bert-base-uncased')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# Classify from the hidden state of the first token ([CLS]).
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, 0, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model
model.evaluate(test_dataset.batch(batch_size))

In [None]:
# Re-run the evaluation and keep the result so the score can be recorded.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
# evaluate returns the loss first, then the compiled metrics in order; with
# metrics=['accuracy'] the accuracy is therefore at index 1.
accuracy = evaluation_results[1]  # Assuming accuracy is the second metric in the metrics list


In [None]:
# Add the BERT score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'BERT', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# RoBERTa


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFRobertaModel, RobertaTokenizer

# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoder = TFRobertaModel.from_pretrained('roberta-base')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# Classify from the hidden state of the first token (<s>).
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, 0, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model: evaluate returns the loss first, then the compiled
# metrics in order, so the accuracy is at index 1.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
accuracy = evaluation_results[1]

In [None]:
# Add the RoBERTa score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'Roberta', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# XLNet

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFXLNetModel, XLNetTokenizer

# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
encoder = TFXLNetModel.from_pretrained('xlnet-base-cased')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# XLNet's tokenizer pads on the LEFT and appends <sep><cls> at the END of the
# sequence, so position 0 is padding for short queries. The sequence summary
# lives in the last position, which is what the classifier must read.
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, -1, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model: evaluate returns the loss first, then the compiled
# metrics in order, so the accuracy is at index 1.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
accuracy = evaluation_results[1]

In [None]:
# Add the XLNet score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'XLnet', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# DistilBERT


In [None]:
from transformers import TFDistilBertModel, DistilBertTokenizer

# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# Classify from the hidden state of the first token ([CLS]).
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, 0, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model: evaluate returns the loss first, then the compiled
# metrics in order, so the accuracy is at index 1.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
accuracy = evaluation_results[1]

In [None]:
# Add the DistilBERT score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'DistilBert', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


In [None]:
# Print the results collected so far (one row per evaluated model).
print(results_df)


# Albert

In [None]:
from transformers import TFAlbertModel, AlbertTokenizer

# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
encoder = TFAlbertModel.from_pretrained('albert-base-v2')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# Classify from the hidden state of the first token ([CLS]).
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, 0, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model: evaluate returns the loss first, then the compiled
# metrics in order, so the accuracy is at index 1.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
accuracy = evaluation_results[1]

In [None]:
# Add the ALBERT score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'Albert', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# Electra

In [None]:
from transformers import TFElectraModel, ElectraTokenizer

# Load the pretrained tokenizer and encoder. The encoder gets its own name so
# that `model` only ever refers to the classifier built below.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
encoder = TFElectraModel.from_pretrained('google/electra-base-discriminator')

# Training split: map every intent name to an integer code.
# .copy() yields an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of train_data_frame.
data_train = train_data_frame[['query', 'intent']].copy()
data_train['category'] = pd.Categorical(data_train['intent'])
data_train['intent'] = data_train['category'].cat.codes

# Validation split: reuse the training categories so that an intent receives
# the same integer code in both splits, even if the splits do not contain
# exactly the same set of intents.
data_test = validation_data_frame[['query', 'intent']].copy()
data_test['category'] = pd.Categorical(
    data_test['intent'], categories=data_train['category'].cat.categories
)
data_test['intent'] = data_test['category'].cat.codes

# Extract the training and testing texts and labels
train_texts = data_train['query'].tolist()
train_labels = data_train['intent'].tolist()
test_texts = data_test['query'].tolist()
test_labels = data_test['intent'].tolist()

max_length = 128  # Adjust based on your dataset or model's max length

# padding='max_length' already pads every sequence to max_length, so the
# deprecated pad_to_max_length flag is not passed.
train_encodings = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

test_encodings = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors="tf",
)

# Convert the labels to one-hot encoding
num_labels = len(data_train['category'].cat.categories)
train_labels_encoded = tf.one_hot(train_labels, num_labels)
test_labels_encoded = tf.one_hot(test_labels, num_labels)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_encoded))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels_encoded))

# Define the model architecture; the input length follows max_length so the
# two cannot drift apart.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
output = encoder(input_ids, attention_mask=attention_mask)[0]
# Classify from the hidden state of the first token ([CLS]).
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output[:, 0, :])
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# The Dense head already applies softmax, so the loss receives probabilities
# and must not treat them as logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
metrics = ['accuracy']

# Use smaller batch size
batch_size = 20

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.fit(train_dataset.batch(batch_size), epochs=10)

# Evaluate the model: evaluate returns the loss first, then the compiled
# metrics in order, so the accuracy is at index 1.
evaluation_results = model.evaluate(test_dataset.batch(batch_size))
accuracy = evaluation_results[1]

In [None]:
# Add the ELECTRA score to the results table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_df = pd.concat(
    [results_df, pd.DataFrame([{'model_name': 'Electra', 'validation_accuracy': accuracy}])],
    ignore_index=True,
)

# Display the DataFrame
print(results_df)


# Final Evaluation

In [None]:
# Find the index of the row with the maximum validation accuracy.
# The table was started from an empty frame, so the accuracy column may carry
# object dtype; coercing to numeric first keeps idxmax from failing on it.
max_accuracy_index = pd.to_numeric(results_df['validation_accuracy']).idxmax()

# Retrieve the model name with the maximum validation accuracy
best_model_name = results_df.loc[max_accuracy_index, 'model_name']

print(f"The best model with maximum validation accuracy is: {best_model_name}")