In [None]:
import json


In [None]:
def preprocess_data(file_path, default_intent="inform"):
    """Read a dialogue JSON file and flatten it into user-turn examples.

    Args:
        file_path: Path to a JSON file holding a list of dialogues. Each
            dialogue is read through its "domains" and "turns" entries; each
            turn through "speaker", "utterance" and
            "dialogue_acts" -> "binary".
        default_intent: Label assigned to a user turn that carries no binary
            dialogue act. Defaults to "inform".

    Returns:
        A list of ``(utterance, intent, domain)`` tuples, one per user turn,
        in file order. ``domain`` is the first domain listed for the dialogue,
        or ``None`` when the dialogue lists no domain at all.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    examples = []
    for dialogue in data:
        # Only the first domain is kept (single-domain assumption); a missing
        # or empty list yields None instead of raising IndexError.
        domains = dialogue.get("domains") or [None]
        domain = domains[0]

        for turn in dialogue["turns"]:
            if turn["speaker"] != "user":
                continue

            # The first binary act decides the intent; turns without one fall
            # back to the default label.
            binary_acts = (turn.get("dialogue_acts") or {}).get("binary")
            intent = binary_acts[0]["intent"] if binary_acts else default_intent
            examples.append((turn["utterance"], intent, domain))

    return examples

In [None]:
# Flatten the dialogue file into (utterance, intent, domain) examples.
dialogues_path = "drive/MyDrive/KVRET/dialogues.json"
dialogues = preprocess_data(dialogues_path)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the pre-trained BERT tokenizer, plus a sequence-classification model
# whose output size equals the number of distinct intent labels in `dialogues`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

distinct_intents = {example[1] for example in dialogues}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(distinct_intents),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
def tokenize_text(text, max_length=512):
    """Encode `text` with the module-level `tokenizer`.

    The input is truncated and padded to exactly `max_length`, and the
    tokenizer is asked for PyTorch tensors (`return_tensors='pt'`).

    Args:
        text: The text to encode.
        max_length: Length to truncate/pad to. Defaults to 512.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encoding_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(text, **encoding_options)

In [None]:
class IntentDataset(Dataset):
    """Torch dataset that turns (text, intent, ...) examples into model inputs.

    Args:
        examples: Indexable collection of examples; element 0 of each example
            is the utterance text and element 1 is its intent label.
        tokenizer: Callable tokenizer used to encode the text. It is called
            with truncation, 'max_length' padding and `return_tensors='pt'`.
        label_encoder: Object whose `transform([label])` maps an intent label
            to its integer class id.
        max_length: Length every encoded example is truncated/padded to.
            Defaults to 512.
    """

    def __init__(self, examples, tokenizer, label_encoder, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the encoded fields of example `idx` plus its 'labels' tensor."""
        example = self.examples[idx]
        input_text = example[0]  # Extract the text from the tuple
        intent = example[1]  # Extract the intent from the tuple

        # Use the tokenizer handed to this dataset rather than a module-level
        # one, so the dataset encodes with exactly what the caller supplied.
        tokenized_input = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop
        # it from every field the tokenizer produced so the DataLoader can
        # stack examples itself. Iterating over the keys (instead of naming
        # them) also works for tokenizers that emit no token_type_ids.
        for key in list(tokenized_input.keys()):
            tokenized_input[key] = tokenized_input[key].squeeze(0)

        label = self.label_encoder.transform([intent])[0]  # Encode the intent label
        tokenized_input['labels'] = torch.tensor(label, dtype=torch.long)

        return tokenized_input


In [None]:
# Fit a label encoder on the intent (second field) of every example.
intent_labels = [intent for _, intent, *_ in dialogues]
label_encoder = LabelEncoder()
label_encoder.fit(intent_labels)

In [None]:
# Build the dataset from the examples, the tokenizer and the fitted label encoder
dataset = IntentDataset(
    examples=dialogues,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)

In [None]:
# Split the data into training (80%) and testing (remaining 20%) sets.
# The split draws from a dedicated, seeded generator so the same examples end
# up in the held-out set on every run; without it the test set changes on each
# kernel restart and the reported metrics cannot be reproduced.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled every epoch; evaluation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Optimizer over all model parameters.
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear schedule with no warm-up, sized for every batch of every epoch.
batches_per_epoch = len(train_dataloader)
total_steps = batches_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
for epoch in range(num_epochs):
    # Training loop
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients before the forward pass so that anything left over
        # from an interrupted earlier run of this cell cannot leak into the
        # first update.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs[0]  # Access loss from the outputs tuple
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Evaluation loop
    model.eval()
    total_eval_loss = 0.0
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs[0]  # Access loss from the outputs tuple
            logits = outputs[1]  # Access logits from the outputs tuple

            # Accumulate per-example totals. Averaging per-batch accuracies
            # would give a short final batch the same weight as a full one
            # and bias the reported metrics.
            batch_size = batch['labels'].size(0)
            predictions = torch.argmax(logits, dim=-1)
            total_correct += (predictions == batch['labels']).sum().item()
            # NOTE(review): assumes the model loss is a mean over the batch,
            # so scaling by batch_size recovers the per-example sum - confirm.
            total_eval_loss += loss.item() * batch_size
            total_examples += batch_size

    # Print evaluation metrics (NaN instead of ZeroDivisionError when the
    # test loader yields nothing)
    avg_eval_loss = total_eval_loss / total_examples if total_examples else float('nan')
    avg_eval_accuracy = total_correct / total_examples if total_examples else float('nan')
    print(f"Epoch: {epoch + 1}, Loss: {avg_eval_loss}, Accuracy: {avg_eval_accuracy}")


Epoch: 1, Loss: 0.2663197937135649, Accuracy: 0.900990099009901
Epoch: 2, Loss: 0.2722726881430291, Accuracy: 0.9047029702970297
Epoch: 3, Loss: 0.27808635460563225, Accuracy: 0.8991336633663366


In [None]:
import pickle

save_directory = "drive/MyDrive/KVRET/trained_kvret_intent_model"

# Write the model first and then the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

# Pickle the fitted label encoder next to them
label_encoder_path = f"{save_directory}/label_encoder.pkl"
with open(label_encoder_path, "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import pickle

load_directory = "drive/MyDrive/KVRET/trained_kvret_intent_model"

# Restore the tokenizer and the fine-tuned classifier from the same directory
loaded_tokenizer = BertTokenizer.from_pretrained(load_directory)
loaded_model = BertForSequenceClassification.from_pretrained(load_directory)

# Restore the label encoder.
# NOTE: unpickling can execute arbitrary code, so only load a
# label_encoder.pkl that comes from a trusted source.
label_encoder_path = f"{load_directory}/label_encoder.pkl"
with open(label_encoder_path, "rb") as encoder_file:
    loaded_label_encoder = pickle.load(encoder_file)
